In [ ]:
import pandas as pd

In [ ]:
# Extract the per-county case counts from the Texas workbook and save them as CSV.
# header=7: column labels are taken from row 7 (0-indexed) of the sheet.
# `sheet_name` is the supported keyword; the older `sheetname` spelling is
# deprecated and rejected by current pandas releases.
texas_county_df = (
    pd.read_excel("texas-data.xls", sheet_name="Zika Texas", header=7)
    [['County', 'Case Counts']]
    .rename(columns={'County': 'county', 'Case Counts': 'case_counts'})
)
texas_county_df.to_csv("data/texas_data_county.csv", index=False, encoding='utf-8')

In [ ]:
# Keep only the 'Countries' column of the CDC 9-May-2016 sheet and save it as CSV.
# header=7: column labels are taken from row 7 (0-indexed) of the sheet.
# `sheet_name` is the supported keyword; the older `sheetname` spelling is
# deprecated and rejected by current pandas releases.
cdc_may_16_countries_df = pd.read_excel(
    "cdc-data.xls", sheet_name="cdc 9-May-2016", header=7
)[['Countries']]
cdc_may_16_countries_df.to_csv("data/CDC_May_2016_Countries.csv", index=False, encoding='utf-8')

In [ ]:
# Load the CDC US sheet and trim it down to three columns.
# header=7: column labels are taken from row 7 (0-indexed) of the sheet.
# `sheet_name` is the supported keyword; the older `sheetname` spelling is
# deprecated and rejected by current pandas releases.
cdc_may_16_us_df = (
    pd.read_excel("cdc-data.xls", sheet_name="cdc US 4-May-2016", header=7)
    # Columns not needed for the export.
    .drop(['Data', 'Map'], axis=1)
    # Rows with index labels 0, 1, 44 and 45 are discarded; presumably these are
    # section headings / totals rather than states -- TODO confirm against the sheet.
    .drop([0, 1, 44, 45], axis=0)
)
# Assumes exactly three columns remain after the drops (assignment fails otherwise).
cdc_may_16_us_df.columns = ['states', 'travel_cases', 'local_cases']
cdc_may_16_us_df

In [ ]:
# Reduce every cell in the two case-count columns to its first
# whitespace-separated token (presumably the leading number in cells that also
# carry trailing text -- TODO confirm against the raw sheet).
for case_column in ('travel_cases', 'local_cases'):
    cdc_may_16_us_df[case_column] = cdc_may_16_us_df[case_column].apply(
        lambda cell: cell.split()[0]
    )
cdc_may_16_us_df

In [ ]:
# Save the cleaned US table. The encoding is given explicitly so this export
# matches the other CSV exports in this notebook, which all pass encoding='utf-8'.
cdc_may_16_us_df.to_csv("data/CDC_May_2016_US.csv", index=False, encoding='utf-8')

In [ ]:
# The file is parsed as tab-delimited even though it carries a .csv extension.
occurrence = pd.read_csv(
    "GBIF Mosquito Occurrence Dataset.csv",
    delimiter="\t",
)
# Show the available column names.
occurrence.columns

Should we only consider certain genera or species?


In [ ]:
# Count of non-null `datasetkey` values for each (genus, species) pair.
(
    occurrence
    .groupby(by=['genus', 'species'])['datasetkey']
    .count()
)

There is a wide range of years of observation. Should we focus on a certain range?


In [ ]:
# Count of non-null `datasetkey` values for each (locality, year) pair.
(
    occurrence
    .groupby(by=['locality', 'year'])['datasetkey']
    .count()
)

In [ ]:
# Count of non-null `datasetkey` values per year.
(
    occurrence
    .groupby(by='year')['datasetkey']
    .count()
)

In [ ]:
# Date and locality columns for the rows whose `year` is earlier than 1900.
occurrence.loc[
    occurrence.year < 1900,
    [u'eventdate', u'day', u'month', u'year', u'locality']
]

In [ ]: